{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 246,
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score\n",
    "import lightgbm as lgb\n",
    "import xgboost as xgb\n",
    "from sklearn.ensemble import RandomForestRegressor as rfr\n",
    "from sklearn.linear_model import LinearRegression as lr\n",
    "from sklearn.model_selection import  KFold, StratifiedKFold,GroupKFold, RepeatedKFold\n",
    "import warnings"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:30.718129500Z",
     "start_time": "2024-09-27T12:57:30.638399600Z"
    }
   },
   "id": "a00f76b64a06a024"
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "outputs": [
    {
     "data": {
      "text/plain": "   id  loanAmnt  term  interestRate  installment grade subGrade  \\\n0   0   35000.0     5         19.52       917.97     E       E2   \n1   1   18000.0     5         18.49       461.90     D       D2   \n2   2   12000.0     5         16.99       298.17     D       D3   \n3   3   11000.0     3          7.26       340.96     A       A4   \n4   4    3000.0     3         12.99       101.07     C       C2   \n\n   employmentTitle employmentLength  homeOwnership  ...    n5    n6    n7  \\\n0            320.0          2 years              2  ...   9.0   8.0   4.0   \n1         219843.0          5 years              0  ...   NaN   NaN   NaN   \n2          31698.0          8 years              0  ...   0.0  21.0   4.0   \n3          46854.0        10+ years              1  ...  16.0   4.0   7.0   \n4             54.0              NaN              1  ...   4.0   9.0  10.0   \n\n     n8   n9   n10  n11  n12  n13  n14  \n0  12.0  2.0   7.0  0.0  0.0  0.0  2.0  \n1   NaN  NaN  13.0  NaN  NaN  NaN  NaN  \n2   5.0  3.0  11.0  0.0  0.0  0.0  4.0  \n3  21.0  6.0   9.0  0.0  0.0  0.0  1.0  \n4  15.0  7.0  12.0  0.0  0.0  0.0  4.0  \n\n[5 rows x 47 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>loanAmnt</th>\n      <th>term</th>\n      <th>interestRate</th>\n      <th>installment</th>\n      <th>grade</th>\n      <th>subGrade</th>\n      <th>employmentTitle</th>\n      <th>employmentLength</th>\n      <th>homeOwnership</th>\n      <th>...</th>\n      <th>n5</th>\n      <th>n6</th>\n      <th>n7</th>\n      <th>n8</th>\n      <th>n9</th>\n      <th>n10</th>\n      <th>n11</th>\n      <th>n12</th>\n      <th>n13</th>\n      <th>n14</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>35000.0</td>\n      <td>5</td>\n      <td>19.52</td>\n      <td>917.97</td>\n      <td>E</td>\n      <td>E2</td>\n      <td>320.0</td>\n      <td>2 years</td>\n      <td>2</td>\n      <td>...</td>\n      <td>9.0</td>\n      <td>8.0</td>\n      <td>4.0</td>\n      <td>12.0</td>\n      <td>2.0</td>\n      <td>7.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>2.0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>18000.0</td>\n      <td>5</td>\n      <td>18.49</td>\n      <td>461.90</td>\n      <td>D</td>\n      <td>D2</td>\n      <td>219843.0</td>\n      <td>5 years</td>\n      <td>0</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>13.0</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>12000.0</td>\n      <td>5</td>\n      <td>16.99</td>\n      <td>298.17</td>\n      <td>D</td>\n      <td>D3</td>\n    
  <td>31698.0</td>\n      <td>8 years</td>\n      <td>0</td>\n      <td>...</td>\n      <td>0.0</td>\n      <td>21.0</td>\n      <td>4.0</td>\n      <td>5.0</td>\n      <td>3.0</td>\n      <td>11.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>4.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>11000.0</td>\n      <td>3</td>\n      <td>7.26</td>\n      <td>340.96</td>\n      <td>A</td>\n      <td>A4</td>\n      <td>46854.0</td>\n      <td>10+ years</td>\n      <td>1</td>\n      <td>...</td>\n      <td>16.0</td>\n      <td>4.0</td>\n      <td>7.0</td>\n      <td>21.0</td>\n      <td>6.0</td>\n      <td>9.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>1.0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>3000.0</td>\n      <td>3</td>\n      <td>12.99</td>\n      <td>101.07</td>\n      <td>C</td>\n      <td>C2</td>\n      <td>54.0</td>\n      <td>NaN</td>\n      <td>1</td>\n      <td>...</td>\n      <td>4.0</td>\n      <td>9.0</td>\n      <td>10.0</td>\n      <td>15.0</td>\n      <td>7.0</td>\n      <td>12.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>0.0</td>\n      <td>4.0</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 47 columns</p>\n</div>"
     },
     "execution_count": 247,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data = pd.read_csv(\"D:\\PycharmProjects\\pythonProject\\data\\\\train.csv\")\n",
    "test_data = pd.read_csv(\"D:\\PycharmProjects\\pythonProject\\data\\\\testA.csv\")\n",
    "train_data.head(5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:32.929667100Z",
     "start_time": "2024-09-27T12:57:30.653267200Z"
    }
   },
   "id": "45c3edf9d3acffd9"
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 数据处理"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "e8e33c8c6d09d83e"
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "outputs": [
    {
     "data": {
      "text/plain": "(1000000, 47)"
     },
     "execution_count": 248,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 由于等下需要对特征进行变化，因此我先将训练集和测试集堆叠在一起，一起处理才方便，再加入一列作为区分即可。\n",
    "target = train_data[\"isDefault\"]\n",
    "train_data[\"origin\"] = \"train\"\n",
    "test_data[\"origin\"] = \"test\"\n",
    "del train_data[\"isDefault\"]\n",
    "\n",
    "data = pd.concat([train_data, test_data], axis = 0, ignore_index = True)\n",
    "data.shape"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:33.088894500Z",
     "start_time": "2024-09-27T12:57:32.929667100Z"
    }
   },
   "id": "da7f9ee0dc218fae"
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1000000 entries, 0 to 999999\n",
      "Data columns (total 47 columns):\n",
      " #   Column              Non-Null Count    Dtype  \n",
      "---  ------              --------------    -----  \n",
      " 0   id                  1000000 non-null  int64  \n",
      " 1   loanAmnt            1000000 non-null  float64\n",
      " 2   term                1000000 non-null  int64  \n",
      " 3   interestRate        1000000 non-null  float64\n",
      " 4   installment         1000000 non-null  float64\n",
      " 5   grade               1000000 non-null  object \n",
      " 6   subGrade            1000000 non-null  object \n",
      " 7   employmentTitle     999999 non-null   float64\n",
      " 8   employmentLength    941459 non-null   object \n",
      " 9   homeOwnership       1000000 non-null  int64  \n",
      " 10  annualIncome        1000000 non-null  float64\n",
      " 11  verificationStatus  1000000 non-null  int64  \n",
      " 12  issueDate           1000000 non-null  object \n",
      " 13  purpose             1000000 non-null  int64  \n",
      " 14  postCode            999999 non-null   float64\n",
      " 15  regionCode          1000000 non-null  int64  \n",
      " 16  dti                 999700 non-null   float64\n",
      " 17  delinquency_2years  1000000 non-null  float64\n",
      " 18  ficoRangeLow        1000000 non-null  float64\n",
      " 19  ficoRangeHigh       1000000 non-null  float64\n",
      " 20  openAcc             1000000 non-null  float64\n",
      " 21  pubRec              1000000 non-null  float64\n",
      " 22  pubRecBankruptcies  999479 non-null   float64\n",
      " 23  revolBal            1000000 non-null  float64\n",
      " 24  revolUtil           999342 non-null   float64\n",
      " 25  totalAcc            1000000 non-null  float64\n",
      " 26  initialListStatus   1000000 non-null  int64  \n",
      " 27  applicationType     1000000 non-null  int64  \n",
      " 28  earliesCreditLine   1000000 non-null  object \n",
      " 29  title               999999 non-null   float64\n",
      " 30  policyCode          1000000 non-null  float64\n",
      " 31  n0                  949619 non-null   float64\n",
      " 32  n1                  949619 non-null   float64\n",
      " 33  n2                  949619 non-null   float64\n",
      " 34  n3                  949619 non-null   float64\n",
      " 35  n4                  958367 non-null   float64\n",
      " 36  n5                  949619 non-null   float64\n",
      " 37  n6                  949619 non-null   float64\n",
      " 38  n7                  949619 non-null   float64\n",
      " 39  n8                  949618 non-null   float64\n",
      " 40  n9                  949619 non-null   float64\n",
      " 41  n10                 958367 non-null   float64\n",
      " 42  n11                 912673 non-null   float64\n",
      " 43  n12                 949619 non-null   float64\n",
      " 44  n13                 949619 non-null   float64\n",
      " 45  n14                 949619 non-null   float64\n",
      " 46  origin              1000000 non-null  object \n",
      "dtypes: float64(33), int64(8), object(6)\n",
      "memory usage: 358.6+ MB\n"
     ]
    }
   ],
   "source": [
    "# 接下来就是对data进行处理，可以先看看其大致的信息：\n",
    "data.info()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:33.648722700Z",
     "start_time": "2024-09-27T12:57:33.088894500Z"
    }
   },
   "id": "f23880943d7d1f77"
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "outputs": [
    {
     "data": {
      "text/plain": "[Text(0, 0, 'employmentTitle'),\n Text(1, 0, 'postCode'),\n Text(2, 0, 'title'),\n Text(3, 0, 'dti'),\n Text(4, 0, 'pubRecBankruptcies'),\n Text(5, 0, 'revolUtil'),\n Text(6, 0, 'n10'),\n Text(7, 0, 'n4'),\n Text(8, 0, 'n12'),\n Text(9, 0, 'n9'),\n Text(10, 0, 'n7'),\n Text(11, 0, 'n6'),\n Text(12, 0, 'n3'),\n Text(13, 0, 'n13'),\n Text(14, 0, 'n2'),\n Text(15, 0, 'n1'),\n Text(16, 0, 'n0'),\n Text(17, 0, 'n5'),\n Text(18, 0, 'n14'),\n Text(19, 0, 'n8'),\n Text(20, 0, 'employmentLength'),\n Text(21, 0, 'n11')]"
     },
     "execution_count": 250,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": "<Figure size 640x480 with 1 Axes>",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAIGCAYAAAB3QNS5AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAABJ5UlEQVR4nO3deXhN5/738c9GEtoKkZAZiXNaMVUbqmjaOq2oeWxDS/VQbcrvVDmqhpZWKb+OjppKOTp4UENLiaLHUK20xqKG6imSIDGEihgyruePPtmP3YTaW5J9b96v69pXs++99td32ZV8cq+17mWzLMsSAACAwcq4uwEAAIA/Q2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABivnLsbKC75+fk6duyYKlasKJvN5u52AADANbAsS+fOnVNISIjKlLnyPMoNE1iOHTum8PBwd7cBAABckJKSorCwsCu+fsMElooVK0r6fYd9fX3d3A0AALgWGRkZCg8Pt/8cv5IbJrAUHAby9fUlsAAA4GH+7HQOTroFAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMF45dzcAAABKXs1hK67r/YcntC2mTlzDDAsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADCeS4Fl6tSpioiIUPny5RUdHa2NGzdedfsNGzYoOjpa5cuXV2RkpKZPn15om4kTJ+qOO+5QhQoVFB4erkGDBunSpUuutAcAAG4wTgeWBQsW6IUXXtDIkSO1Y8cOxcTEqHXr1kpOTi5y+0OHDqlNmzaKiYnRjh07NGLECD3//PNavHixfZu5c+dq2LBhGj16tPbt26dZs2ZpwYIFGj58uOt7BgAAbhg2y7IsZ97QpEkT3X333Zo2bZp9LCoqSp06ddL48eMLbf/SSy9p2bJl2rdvn30sPj5eO3fuVGJioiTpf/7nf7Rv3z795z//sW/zz3/+U5s3b/7T2ZsCGRkZqlSpks6ePStfX19ndgkAgBtezWErruv9hye0LaZOHF3rz2+nZliys7O1bds2xcbGOozHxsZq06ZNRb4nMTGx0PatWrXS1q1blZOTI0m67777tG3bNm3evFmSdPDgQSUkJKht2yv/5WRlZSkjI8PhAQAAbkzlnNn41KlTysvLU2BgoMN4YGCg0tLSinxPWlpakdvn5ubq1KlTCg4OVvfu3XXy5Endd999sixLubm5eu655zRs2LAr9jJ+/Hi99tprzrQPAAA8lEsn3dpsNofnlmUVGvuz7S8fX79+vcaNG6epU6dq+/btWrJkiZYvX67XX3/9ijWHDx+us2fP2h8pKSmu7AoAAPAATs2wBAQEqGzZsoVmU06cOFFoFqVAUFBQkduXK1dO/v7+kqRXXnlFvXr10tNPPy1Jql+/vs6fP69nnnlGI0eOVJkyhXOVj4+PfHx8nGkfAAB4KKdmWLy9vRUdHa01a9Y4jK9Zs0bNmjUr8j1NmzYttP3q1avVqFEjeXl5SZIuXLhQKJSULVtWlmXJyXOCAQDADcjpQ0KDBw/Whx9+qNmzZ2vfvn0aNGiQkpOTFR8fL+n3QzVPPvmkffv4+HglJSVp8ODB2rdvn2bPnq1Zs2ZpyJAh9m3at2+vadOmaf78+Tp06JDWrFmjV155RR06dFDZsmWLYTcBAIAnc+qQkCTFxcUpPT1dY8aMUWpqqurVq6eEhATVqFFDkpSamuqwJktERIQSEhI0aNAgTZkyRSEh
IZo0aZK6du1q3+bll1+WzWbTyy+/rKNHj6pq1apq3769xo0bVwy7CAAAPJ3T67CYinVYAAC4sptqHRYAAAB3ILAAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPJcCy9SpUxUREaHy5csrOjpaGzduvOr2GzZsUHR0tMqXL6/IyEhNnz690Da//fabBgwYoODgYJUvX15RUVFKSEhwpT0AAHCDcTqwLFiwQC+88IJGjhypHTt2KCYmRq1bt1ZycnKR2x86dEht2rRRTEyMduzYoREjRuj555/X4sWL7dtkZ2erZcuWOnz4sBYtWqSff/5ZM2fOVGhoqOt7BgAAbhg2y7IsZ97QpEkT3X333Zo2bZp9LCoqSp06ddL48eMLbf/SSy9p2bJl2rdvn30sPj5eO3fuVGJioiRp+vTpeuutt7R//355eXm5tCMZGRmqVKmSzp49K19fX5dqAABwo6o5bMV1vf/whLbF1Imja/357dQMS3Z2trZt26bY2FiH8djYWG3atKnI9yQmJhbavlWrVtq6datycnIkScuWLVPTpk01YMAABQYGql69enrjjTeUl5d3xV6ysrKUkZHh8AAAADcmpwLLqVOnlJeXp8DAQIfxwMBApaWlFfmetLS0IrfPzc3VqVOnJEkHDx7UokWLlJeXp4SEBL388st65513NG7cuCv2Mn78eFWqVMn+CA8Pd2ZXAACAB3HppFubzebw3LKsQmN/tv3l4/n5+apWrZpmzJih6Ohode/eXSNHjnQ47PRHw4cP19mzZ+2PlJQUV3YFAAB4gHLObBwQEKCyZcsWmk05ceJEoVmUAkFBQUVuX65cOfn7+0uSgoOD5eXlpbJly9q3iYqKUlpamrKzs+Xt7V2oro+Pj3x8fJxpHwAAeCinZli8vb0VHR2tNWvWOIyvWbNGzZo1K/I9TZs2LbT96tWr1ahRI/sJts2bN9d///tf5efn27c5cOCAgoODiwwrAADg5uL0IaHBgwfrww8/1OzZs7Vv3z4NGjRIycnJio+Pl/T7oZonn3zSvn18fLySkpI0ePBg7du3T7Nnz9asWbM0ZMgQ+zbPPfec0tPTNXDgQB04cEArVqzQG2+8oQEDBhTDLgIAAE/n1CEhSYqLi1N6errGjBmj1NRU1atXTwkJCapRo4YkKTU11WFNloiICCUkJGjQoEGaMmWKQkJCNGnSJHXt2tW+TXh4uFavXq1BgwapQYMGCg0N1cCBA/XSSy8Vwy4CAABP5/Q6LKZiHRYAwI2iJNZMuanWYQEAAHAHAgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYj
sAAAAOMRWAAAgPEILAAAwHjl3N0AANfUHLbiut5/eELbYq3nKTU9oceSqPnHeiVRk/0uvpoojBkWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADBeOXc3AJim5rAV113j8IS2JV4TAG4mzLAAAADjEVgAAIDxXAosU6dOVUREhMqXL6/o6Ght3Ljxqttv2LBB0dHRKl++vCIjIzV9+vQrbjt//nzZbDZ16tTJldYAAMANyOnAsmDBAr3wwgsaOXKkduzYoZiYGLVu3VrJyclFbn/o0CG1adNGMTEx2rFjh0aMGKHnn39eixcvLrRtUlKShgwZopiYGOf3BAAA3LCcDizvvvuu+vbtq6efflpRUVGaOHGiwsPDNW3atCK3nz59uqpXr66JEycqKipKTz/9tPr06aO3337bYbu8vDw98cQTeu211xQZGena3gAAgBuSU4ElOztb27ZtU2xsrMN4bGysNm3aVOR7EhMTC23fqlUrbd26VTk5OfaxMWPGqGrVqurbt+819ZKVlaWMjAyHBwAAuDE5FVhOnTqlvLw8BQYGOowHBgYqLS2tyPekpaUVuX1ubq5OnTolSfruu+80a9YszZw585p7GT9+vCpVqmR/hIeHO7MrAADAg7h00q3NZnN4bllWobE/275g/Ny5c+rZs6dmzpypgICAa+5h+PDhOnv2rP2RkpLixB4AAABP4tTCcQEBASpbtmyh2ZQTJ04UmkUpEBQUVOT25cqVk7+/v/bs2aPDhw+rffv29tfz8/N/b65cOf3888+qVatWobo+Pj7y8fFxpn0AAOChnJph8fb2VnR0tNasWeMwvmbNGjVr1qzI9zRt2rTQ9qtXr1ajRo3k5eWl2rVra/fu3frxxx/tjw4dOqhFixb68ccfOdQDAACcX5p/8ODB6tWrlxo1aqSmTZtqxowZSk5OVnx8vKTfD9UcPXpUH3/8sSQpPj5ekydP1uDBg9WvXz8lJiZq1qxZmjdvniSpfPnyqlevnsOfUblyZUkqNA4AAG5OTgeWuLg4paena8yYMUpNTVW9evWUkJCgGjVqSJJSU1Md1mSJiIhQQkKCBg0apClTpigkJESTJk1S165di28vAADADc2lmx/2799f/fv3L/K1OXPmFBp74IEHtH379muuX1QNAABw8+JeQgAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIznUmCZOnWqIiIiVL58eUVHR2vjxo1X3X7Dhg2Kjo5W+fLlFRkZqenTpzu8PnPmTMXExMjP
z09+fn56+OGHtXnzZldaAwAANyCnA8uCBQv0wgsvaOTIkdqxY4diYmLUunVrJScnF7n9oUOH1KZNG8XExGjHjh0aMWKEnn/+eS1evNi+zfr169WjRw+tW7dOiYmJql69umJjY3X06FHX9wwAANwwnA4s7777rvr27aunn35aUVFRmjhxosLDwzVt2rQit58+fbqqV6+uiRMnKioqSk8//bT69Omjt99+277N3Llz1b9/fzVs2FC1a9fWzJkzlZ+fr//85z+u7xkAALhhOBVYsrOztW3bNsXGxjqMx8bGatOmTUW+JzExsdD2rVq10tatW5WTk1Pkey5cuKCcnBxVqVLFmfYAAMANqpwzG586dUp5eXkKDAx0GA8MDFRaWlqR70lLSyty+9zcXJ06dUrBwcGF3jNs2DCFhobq4YcfvmIvWVlZysrKsj/PyMhwZlcAAIAHcemkW5vN5vDcsqxCY3+2fVHjkvTmm29q3rx5WrJkicqXL3/FmuPHj1elSpXsj/DwcGd2AQAAeBCnAktAQIDKli1baDblxIkThWZRCgQFBRW5fbly5eTv7+8w/vbbb+uNN97Q6tWr1aBBg6v2Mnz4cJ09e9b+SElJcWZXAACAB3EqsHh7eys6Olpr1qxxGF+zZo2aNWtW5HuaNm1aaPvVq1erUaNG8vLyso+99dZbev311/XVV1+pUaNGf9qLj4+PfH19HR4AAODG5PQhocGDB+vDDz/U7NmztW/fPg0aNEjJycmKj4+X9PvMx5NPPmnfPj4+XklJSRo8eLD27dun2bNna9asWRoyZIh9mzfffFMvv/yyZs+erZo1ayotLU1paWnKzMwshl0EAACezqmTbiUpLi5O6enpGjNmjFJTU1WvXj0lJCSoRo0akqTU1FSHNVkiIiKUkJCgQYMGacqUKQoJCdGkSZPUtWtX+zZTp05Vdna2unXr5vBnjR49Wq+++qqLuwYAAG4UTgcWSerfv7/69+9f5Gtz5swpNPbAAw9o+/btV6x3+PBhV9oAAAA3Ce4lBAAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDyXAsvUqVMVERGh8uXLKzo6Whs3brzq9hs2bFB0dLTKly+vyMhITZ8+vdA2ixcvVp06deTj46M6dero888/d6U1AABwA3I6sCxYsEAvvPCCRo4cqR07digmJkatW7dWcnJykdsfOnRIbdq0UUxMjHbs2KERI0bo+eef1+LFi+3bJCYmKi4uTr169dLOnTvVq1cvPfbYY/rhhx9c3zMAAHDDKOfsG95991317dtXTz/9tCRp4sSJWrVqlaZNm6bx48cX2n769OmqXr26Jk6cKEmKiorS1q1b9fbbb6tr1672Gi1bttTw4cMlScOHD9eGDRs0ceJEzZs3z9V9g4FqDltx3TUOT2hbrDX/WA8AYB6nAkt2dra2bdumYcOGOYzHxsZq06ZNRb4nMTFRsbGxDmOtWrXSrFmzlJOTIy8v
LyUmJmrQoEGFtikIOUXJyspSVlaW/fnZs2clSRkZGc7sEq6i3uhV113jp9daOTzPz7pw3TX/+Blfb83irucpNT2hx5Ko6Qk9lkTNor433gx/l+x3ydYsDgV1Lcu6+oaWE44ePWpJsr777juH8XHjxlm33357ke/561//ao0bN85h7LvvvrMkWceOHbMsy7K8vLysuXPnOmwzd+5cy9vb+4q9jB492pLEgwcPHjx48LgBHikpKVfNIE4fEpIkm83m8NyyrEJjf7b9H8edrTl8+HANHjzY/jw/P1+nT5+Wv7//Vd9XEjIyMhQeHq6UlBT5+voaV89TanpCjyVR0xN6LImantBjSdT0hB5LoqYn9OgpNT2hR2dYlqVz584pJCTkqts5FVgCAgJUtmxZpaWlOYyfOHFCgYGBRb4nKCioyO3LlSsnf3//q25zpZqS5OPjIx8fH4exypUrX+uulAhfX99i/aCLu56n1PSEHkuipif0WBI1PaHHkqjpCT2WRE1P6NFTanpCj9eqUqVKf7qNU1cJeXt7Kzo6WmvWrHEYX7NmjZo1a1bke5o2bVpo+9WrV6tRo0by8vK66jZXqgkAAG4uTh8SGjx4sHr16qVGjRqpadOmmjFjhpKTkxUfHy/p90M1R48e1ccffyxJio+P1+TJkzV48GD169dPiYmJmjVrlsPVPwMHDtT999+v//3f/1XHjh21dOlSff311/r222+LaTcBAIAnczqwxMXFKT09XWPGjFFqaqrq1aunhIQE1ahRQ5KUmprqsCZLRESEEhISNGjQIE2ZMkUhISGaNGmS/ZJmSWrWrJnmz5+vl19+Wa+88opq1aqlBQsWqEmTJsWwiyXPx8dHo0ePLnSIypR6nlLTE3osiZqe0GNJ1PSEHkuipif0WBI1PaFHT6npCT2WBJtl/dl1RAAAAO7FvYQAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwHIdsrOztWHDBs2fP1+SdO7cOZ07d+666+bn5+vYsWPXXadAUlKSEhMTi61ecTt9+rQuXPj9plx5eXmaO3eu5s+f/+c3wipFKSkpWrVqlRYvXqxVq1YpJSWl2P+MS5cuFXtNmOns2bPav3+/8vLy3N0KivDRRx/x79FAXNbsom3btqlz584KDAzU/v37de7cOa1atUozZ87UokWLXKp56tQpxcfHa9myZfL29lZmZqaWLVumTZs2acKECU7XO3jwoOLi4pSUlKSLFy/q3LlzWrx4sZYuXWpf2O9abNu2TdHR0ZKkzZs3X3G7e+65x+keJalRo0b68MMP1bBhQ7344otatWqVvLy8FBMTc9U7dl/uzTffvKbthg4d6lRvycnJ6tWrl7Zu3apatWrJ19dXGRkZ+vXXX9W4cWN98sknCg8Pd6rmlYSEhGjnzp2qWrWq0+89e/ashgwZos2bN6tu3boaNWqUateubX+9oG9nrFmzRh9//LH27NmjCxcuKDQ0VI0aNdKgQYMUFBTkdI8dO3ZU586d1bVrV1WsWNHp9xdl0aJFio6OVkREhH777Te98MILWrlypSSpXbt2evfdd69pye8ClmVpxowZOnTokPr06SM/Pz8NHDhQBw8e1N/+9je9+uqr8vb2dqrHn3/+Wb1799bhw4c1bNgw3XHHHerTp48yMzPl5+enhIQE1atXz6makvTtt99qy5Ytqlu3rmJjYx1e69+/v6ZOnep0zaLk5ubqmWee0ezZs51+75w5c/Tpp59qz549yszM1G233aa6deuqV69e6t27t9P1jhw5orCwMPvzpUuX6vPPP5cktW/f3mF9r2txpe9nbdu21aeffio/Pz+nv69dvHhREyZM0N69exUbG6u+ffvqhRde0Lp161S/fn29++67Lv372bVrlwYPHqxdu3YpMzNT0v+/517BL3zFISsrS7fccouRYZrA4qLGjRvrlVdeUYcOHeTn56czZ87o4sWLioyMVGpqqks1O3XqpNq1a+vll19W
eHi4zpw5o9OnT6tRo0Y6ePCg0/X+9re/qWvXrhowYIC9x3PnzqlOnTpOzRDUq1dPP/30k6TfFwIsis1mc6lH6fd7SPz222+y2WwKDAzU9u3bdeuttyoqKuqa/y7//ve//+k2NpvN6W+6LVq0UJMmTTRq1Cjdcsst9vHz589r7NixSkxM1Pr1652qWadOnSLHf/nlF0VGRqps2bLau3evUzWffPJJZWVlqV+/fvruu+80efJkzZ49W+3bt5ckVaxY0anZv3feeUcffPCB+vTpI8uyNGfOHD322GOSfv8htGDBAqdvneHt7a369etr3759ateunXr27KnWrVvbb9HhiurVq2vnzp3y8/NT7969lZOTo1deeUX5+fn2kP/JJ59cc70hQ4Zo586dkqSffvpJ8fHxqlevnr3eQw89dM3huEDLli3VuXNn2Ww2Pf/885o+fbr69u2r/Px8jRgxQrt27VJCQoJTNT/44AONHj1aDz/8sDZv3qywsDAtWrRIVapUkeRaQL0SV3+AvfTSS1q+fLmGDBmiO++8097Tjz/+qPfee0/t2rXT+PHjnap5+X598MEHGjNmjJ5//nmVKVNGkyZN0vDhw9W/f/9rrlemTBkFBwfLx8fHYUY3JSVFISEhKleunNPf13r37q20tDR16tRJixcvtt83r0+fPpozZ44yMzPtIcsZ9evXV6dOnfT44487fC+SZF+49VqdOHHiiq9dunRJNWvWVH5+vtM9lrir3ssZV+Tn52fl5+fbv7Ysy8rLy7OqVKnick1/f38rNzfXoaZlWValSpVcqle5cuVCPebn5zvUNkFAQICVmZlpbd682WrYsKFlWZaVm5trVaxY0c2dWdatt95qZWVlFflaVlaWdcsttzhds169elZMTIyVkJBgrV+/3lq/fr21bt06q0qVKtaSJUus9evXO12zWrVq1oULF+zPd+zYYYWHh1sffvihZVmW03+XQUFBDrd6P3TokFWnTh3Lsixr9erVVnR0tNM9FvSwd+9e6+WXX7YiIyOtKlWqWM8884z1zTffOF3Psizrtttus/Ly8izLsqyQkBCHv4MLFy5YAQEBTtULCQmxfvvtN+v06dOWzWazDh8+bH/tl19+sWrUqOF0jwXfE3Jzcy0vLy/r0qVL9tcyMzOd7tGyLKtWrVrWnj17LMv6/fvOsGHDrNq1a1vJycmWZf3+9+KM1q1bX/ERGxtrlSlTxuke/f39rbS0tCJfS01Ndel75eX7VbduXeuHH36wP9+2bZtVu3Ztp+qNHz/eatCggfXFF184jAcFBVnHjx93uj/LsqyqVatamZmZlmVZ1pkzZ6xy5cpZ586dsyzL9c/bsizL19fX/v38etlsNqtMmTKWzWYr8uHK510aOIfFRfXr19eXX37pMLZixQrdddddLtcMCgrSgQMHHMZ2796tmjVrulSvVq1a+v777x3GEhMTdccdd7jaojp27FjkeJcuXVyu2aNHD7Vo0UK9evXSU089JUnasWOHU/u9bds2+9ebN2++4sNZf/nLXxzue3W5+fPn6y9/+YvTNX/88Ud16dJFL774oo4ePaoHHnhADz74oLy9vdW8eXM98MADTtfMz893+A24YcOG2rBhgyZMmKCxY8e6VM/Pz8/+vGCGTpIeeugh7d+/3+maBaKiovT666/r119/1ZdffikvLy917drV6d8Spd9n/1avXm3v8fJzv9LS0pyevTl//rwqVaokPz8/+fr6OvT0l7/8Renp6U73aLPZJElly5ZVVFSUw9Ln3t7eys7OdrrmiRMn7If8ypQpo/Hjx2vgwIG677779NNPP9n/zGv1zTff6L777lNcXFyhR7du3Zzur6CvK50HcunSJZUp4/yPn8v36/jx4w6Ha+6++24dPXrUqXrDhg3TypUrNX/+fMXGxmrfvn1O91SU3Nxc+3/z8/Pt++rKPhd44okn9MUXXxRHewoJCVFiYqLy8/MLPYrz8FKxc3di8lS7du2y
wsLCrLZt21o+Pj5W165drRo1ali7d+92uebChQutGjVqWG+99ZZ12223We+//751++23W59//rlL9davX29Vq1bNGjBggFWhQgXrn//8pxUWFubyb7OWdeXf1K9nZik/P9/66quvrP/85z/2sa1btzo8/zN169a1f12zZs0iHxEREU73tnnzZis8PNyqU6eOFRcXZ/Xr18+Ki4uzoqKirOrVq1tbt251umaB48ePW0899ZTVvHlza+vWrVZwcLDLv9W1a9fO+uSTTwqNp6amWg0aNHD6N6ZnnnnGatWqlfXVV19ZK1eutFq1amU988wzlmVZVlpamhUeHu50j1f7rT83N9dasWKF0zUTExOt0NBQ6+WXX7bP2rz++uvW66+/bkVGRlrvv/++U/Vq165tnTp1yrIsy/r2228dXktOTrbCwsKc7vG+++6z9u/fX+Rr33zzjXXXXXc5XbNhw4YOswsF5s+fb1WrVs3y9vZ2qt79999vzZs3r8jXLl26ZNlsNqd7HD9+vFWzZk3rzTfftFauXGl999131sqVK60333zTioiIsCZMmOB0zXLlylmtW7e2HnnkEcvX19dKSkqyv3by5EmrWrVqTtcssG7dOuuuu+6ynn/+ecvf39/lf4t9+vSxmjRpYr366qtWixYtrM6dO1u9evWy1q5daz311FNWt27drrnWI4884jDT5eXlZUVHRxeaBXNW165drUmTJhX5WlZWllWzZk2na5YGzmG5DpmZmVqxYoWSk5MVGhqqdu3aydfX97pqbt68Wf/+97/tNfv06aN7773X5XrJycmaN2+evd7jjz/u0oxNwXHhWbNmqW/fvg6vJSUl6cyZM9q0aZPLfWZnZysxMVGpqanq3r27MjIyZLPZiu3kzOuRnZ2t9evXa+/evfYTB+vUqWOfFbleiYmJGjhwoLZv365jx46pWrVqTtc4cOCAzp49q8aNGxd6LSMjQ1988YWefPLJa6538eJFjR07VqtWrZIkxcbG6uWXX9Ytt9yio0eP6pdfftGDDz7oVI/PPfecpk2b5tR7rkVaWpree+89bdmyRUePHlWFChVUr149PfXUU3r44YedqrVs2TI1btxYwcHBhV5btGiRfvnlFw0fPtypmufPn5ePj4/KlSt8r9ldu3YpJyfHflL7tfroo4+UnZ2tfv36FXpt9erVmjBhgtauXXvN9TZu3KhbbrmlyD4sy9I333zj0sxfQkKCPv3000L/dnr27Kk2bdo4Xe+jjz5yeP7ggw/aZ8FWrVqlNWvW6O2333a6boG8vDxNmTJF69at00cffeTS9/Ps7GxNmjRJycnJ6t27t2rVqqXnnntOu3fvVqNGjfTWW29d84n1f9zfK3H2BOaC2diyZcs69T53I7Dgmrz22muSpDfeeEMjR460n6Bms9lUrVo1devWTQEBAS7VLu4rrjp27KilS5cWGu/SpYuWLFniUo8lzbIsZWRkOHVFC4Cbx8KFC/Xoo48WGl+0aJHLh+08DYHFCVFRUdd0bNiZKzyu9Yz2a71EsXXr1tfUozNXJXz22Wf2r63/dxldUQquInFWcV9xdaUrJPz9/V06B+FKrudyzwLJycn66aef7JcpFnD177Io19tnSfToCTU94bORPGO/PaXm5Uz7bK70fa1KlSo6ffq0yz3+UXHsd0khsDhhw4YN17SdM1OnBTMX0u8n0n366afq0KGDwsPDlZKSoi+//FI9e/bU5MmTr6leSUwhtmjRwv61ZVnatGmTgoODFRoaqiNHjigtLU3NmjVz+vLeAlWqVFF6erpsNpv9H19+fr6qVq3qVMAo6cNWf3S96xW8+eabevXVV1W/fn2HyxRtNptT0/l/5nr6LIkePaGmJ3w2kmfst6fU/CNTPpuCiwVatGih9evXO1x+fejQIQ0ZMqRYF7I0eR2WwgdVcUWXB5GrTc85Y/To0favH3roIX311Vdq2rSpfez777/XyJEjr7ne5UHkhx9+UJMmTQpt
4+zVMuvWrbN/HR8fr27duul//ud/7GNTpkyxr9PiioIrrjp06GAfc+WKq8DAwCK/ttlsatCggUvTplc7zn69/6Dffvtt+8Jf16uk+izOHj2ppid8NpLZ++0JNT3hs4mLi5P0+5VVl8/MFKxbNWnSJKdrluR+lyRmWFxUEtNzlStX1smTJx0uxczKylK1atV09uxZY3pMT093OFkrNzdXAQEB+u2331yquXv3brVp00Z33nmnvv76a7Vr105bt27V8uXLXVoBdMOGDS6dIFiU2267TSNGjFBoaGih17KzsxUfH+/yP/DIyEjt3btX5cuXv942S6zP4uzRk2p6wmcjmb3fnlDTkz6bfv36aebMmcVSqyT3uyQRWJxUktNzrVu3VkhIiMaOHavg4GAdO3ZMo0ePVkpKir766qtrrlOwimFkZKQOHTpUqMeOHTsqLS3NpR7vvPNODRw4UH369LGP/fvf/9a7776r3bt3u1RTKp4rroq63YCXl5eqV6+ue+65x6UVVR944AE999xz6t69e6HXsrKyVKFCBZdXhJw5c6Y2bNig4cOHF7pqwNkrhUqqz+Ls0ZNqesJnU9x9lkQ902t60mdTnEpyv0sSgcVJBUvTJycnq3r16vbxgum5oUOHqnPnzi7VPnXqlPr3768vvvhClmWpTJky6tSpk95//32n/gcvU6aMbDZbkTcPDAwM1KhRo/Tcc8+51OOWLVvUtWtXlS9f3n6ezaVLl7R48eIiL6ktTa1bty40lpOTo8OHD8uyLC1fvlxRUVFO1Sypyz2lKy8iZbPZnP7tpqT6LM4ePammJ3w2xd1nSdQzvaYnfTYVKlQo8oIHHx8fhYWFqXPnzho+fLgqVKjwp7VKcr9LEoHFRcU5PfdH+fn5OnnypKpWrXpdKyO2atXKvo5GccrJybGvmRIcHKymTZs6PXvRuHFjbdmyRdLVr75y9p46VzJp0iStWLGiRP4+AKCkTZ48WcuXL9fQoUMVFhamlJQUvfPOO4qNjVWdOnU0ZswY1a1bVx988IG7Wy0xBBbDJCUladGiRTp27JhCQkLUtWtXl5fmN9m3336r++67T9LVr74qrpSfk5OjkJAQnTx50uUapXGZa3HwlD5vRnw25jL9s4mMjNTOnTsdFtPMyMjQnXfeqUOHDiktLU0NGzZ0+nC/6ft9OQKLE6pVq2Y/P6So6TnrOm/1vXz5cj3xxBNq27atatSooeTkZK1YsUKffPKJ/a67f6a0Zy48RUZGhmrXru1wrxlnlNZlrtfLU/q8GfHZmMsTPpvAwED98MMPDr/AHjp0SE2aNNGJEyeUk5OjatWq2e/5dS08Yb8vR2BxQkpKimw2m8LCwpSUlHTF7Vy5iZv0++W977//vsOS5998842ee+457dmz55pqXD5z8dlnnzlc3lvg5MmTRq2M2Lt3bz377LNq1qyZfWzTpk2aOXOm/v3vf193/WPHjmnEiBGy2Wwu16tWrZrWrVtXrJdmlgRP6fNmxGdjLk/4bF599VV99NFHeuaZZxQeHq4jR45oxowZeuqppzRq1Ch9/vnnev/9950KGp6w35cjsDjpSpcKF4cqVaro+PHjDueDuJKaC5TWyojXy9/fXydOnCh0qXRQUJBOnTp1zXWKmvUqOLmtS5cumjp1qsMdiJ1REpdmlgRP6fNmxGdjLk/5bJYtW6bPP/9caWlpCgoKUpcuXa559r0onrLfBQgsTqpYsaLOnTtXIrVbt26tu+++W6+++qq8vLyUk5Oj1157TVu3bnXqsuarXXp9+PBh/fOf/yzWlRGvV1hYmHbv3u0QJk6fPq26des6tTR/UbNe5cqVU3Bw8HWdvCyZfYni5Tylz5sRn425btbPxtP2m8DipIoVK2rv3r1FXjJc4PLLnZ2RkpKi7t27a9euXfbzZerXr6/58+c7VbMkL70uCf3799eRI0f0/vvvq0aNGkpKStLAgQMVHBxcInf3dUVJXJpZEjylz5sRn425POGz
yc3N1YIFC7Rz585CJ8he673m/sgT9vtyBBYnlSlTRuXLl79iYLmek24LpKSk2K8SCg8Pd7lOSV56XZwuXLiggQMH6tNPP1VOTo68vb315JNP6p133tGtt956TTVK4saUAGCKJ554Qjt37lTbtm0dTpCVHG/xciMjsDipJA4JJScna+3atXrqqacKvTZnzhw99NBD1xVcPIVlWfb1Z64lfFyuJG5MCQCmqFy5spKTk51eAfxGws0PneTsD9JrMXr0aDVv3rzI1/Ly8jRq1KhiuVrGVAUnAdtsNofjppdfRv5nCCIAbmR//etfde7cuZs6sDDD4qSSmGGpWbOm9u7dW2iaT5IuXryo2rVrX/Uyak9X1N/ppUuXFBYW5tRVQgVyc3P15ptvat68efZDaz169NCLL77o0v2EAMDdXn/9dS1atEj9+vUrdEKsiYu8lQRmWJxU8IO1Y8eOWrp0aaHXu3TpoiVLljhV89SpUypXruiPomzZskpPT3e+UQ9QcN7JxYsXVadOHYfXTpw4oXbt2rlU98UXX9SWLVv0zjvv2E/iHTt2rE6ePKn33nuvOFoHgFK1du1aValSRYsXL3YYt9lsN01gYYbFRVda48Tf39/pgNGgQQO99dZbatWqVaHXVq1apRdffFG7du1yuVdTbdiwQZZlqU2bNlq5cqV9vODQUO3atV2qGxoaqp9++snhMun09HTVr1/f5ZVuAQDuxQyLk/r37y/p91twF3xdICkpSXfccYfTNf/xj3+oT58+mj59utq2basyZcooPz9fK1asUP/+/W/YM8ALzjvJzMy87nVSLle2bFllZWU5jGVnZxfrnwEApW337t1asmSJTpw4oSlTpmj//v26ePGi7rrrLne3Vir4Du6kwMBA+3L3BV8HBgYqKChI7du317Jly5yu2a9fPz333HPq3r27ypcvr5CQEJUvX17du3dXfHy8nn766eLeDaNcunRJQ4cOVa1atXTLLbcoMjJSQ4cO1fnz512q17t3b7Vq1Upz587Vxo0bNXfuXLVp00Z///vfi7lzACgdH3/8sVq3bq3MzEx9+umnkn4/x3HQoEFu7qz0cEjIRRs2bCj2K1POnj2rxMREpaeny9/fX02bNlWlSpWK9c8wUffu3fXbb7/plVdeUXh4uJKTk/XGG2+oYsWKWrBggdP18vPzNXPmTH322Wf2k24fffRRPfPMM8yyAPBIf/nLX5SQkKDbb79dfn5+OnPmjEu3MPFkBJbrUNy35S7OE3k9SeXKlXXs2DGHq6QyMzMVHh7u0j2U8vLyHO5LBACeLjAwUEePHlW5cuXsS0FcvHhRkZGRTt3CxJNxDouLrnZbblcDy7p164ocv9ZF0TxVvXr19Msvv+jOO++0jx08eNDlO4hWq1ZNXbp0Uffu3dWiRQtmVQB4vIcfflijRo3SuHHj7GMTJkwo8mKNGxUzLC4qzttyF5y8O2vWLPXt29fhtaSkJJ05c0abNm267j/HVAMHDtT/+T//R127dlVYWJiOHDmiJUuW6PHHH1dISIh9u6FDh15TvYMHD2rBggX2Q0Jdu3ZVXFwci8sB8FhnzpxRr169tHHjRp0/f15+fn5q3Lix5s6d6/Jd6D0NgcVFxXlb7tdee02S9MYbb2jEiBH28YLLe7t166aAgIDr/nNMdS0nw9psNs2ePdvp2r/88osWLFigBQsW6PTp0zp69KgrLQKAEY4fP67k5GSFhoYqODhYn3/+ubp06eLutkoFgcVFJXFb7pI4kfdm98033+izzz7T4sWLVbVq1RtyPRsAN6esrCzdcsstRt5ZuSQQWFxUErflXrBgge68807Vrl1bBw4c0IABA1S2bFn961//cml9F0/x2WefXfE1V84H+vbbb/XZZ59p0aJFqly5suLi4vTYY48pKirqetoEAKNkZWWpQoUKys/Pd3crpYLAYpAaNWpoy5Ytqlatmh555BE1bNhQt912m77++mutX7/e3e2VmBYtWjg8
T0tL06+//qrmzZtf8UTkq/nrX/+qxx57THFxcWrQoEFxtQkARmGGBdcsOztbiYmJSk1NVffu3e33GapYsaJL9QqW+z9//rxCQ0N18uRJlS1bVv7+/i5d3uvJPv74Y+3YsYN7/wC4qW3evPmKr2VlZenBBx8ksODqtm3bps6dOyswMFD79+/XuXPntGrVKs2cOVOLFi1yqWbt2rU1Z84c7d69W0uXLtXy5ct17tw51axZ84a9AeKV5OfnuxzULl68qFGjRmnhwoU6ffq0MjIytGrVKu3bt08vvPBC8TcLACUkIiLiT7c5dOhQKXTifqzD4qL4+HhNnjxZHTp0sF9Sdv/99+upp55yuebbb7+tzp07y9vb2x56li9frsaNGxdHy8Y6ceKEw/MLFy5o7ty5CgoKcqle//79lZOTo+XLlysmJkbS7zeYHDhwIIEFgEe5WcLItWCGxUVVqlRRenq6bDabfdXB/Px8Va1atVhnQ3JyciRJXl5exVbTNGXKlJHNZlPB/4q33HKL7rrrLr333ntq1KiR0/WqVaumlJQU+fj42D8bSapUqZLOnj1brL0DQGm4WVdCvxwzLC6qX7++vvzyS3Xo0ME+tmLFiuu+a+amTZs0b948+z1wevTooWbNml1vu0Yr7jPcK1eurJMnTyosLMw+dujQIYdF6ADAk9ysK6FfjsDiosmTJ6tNmzaaMWOGLly4oG7dumnr1q1avny5yzVnzZqlESNGqG/fvoqNjVVycrK6dOmisWPH3rB3bM7NzZWvr69+++03eXt7F0vNgQMHqn379ho5cqTy8vK0fPlyjR07lsNBADxOwUroWVlZ9q8LJCUl3dBLXvwRh4SuQ2ZmplasWGFfdbBdu3by9fV1ud7tt9+uxYsXq379+vaxn376SZ06ddJ///vf4mjZSNHR0VqyZIlq1KhRbDUXLlyo2bNnKzk5WWFhYerTp4/i4uKKrT4AlIabfSX0yxFYDFK1alUdOXJEPj4+9rFLly4pLCzshr59+OjRo7VgwQL169dPYWFhstls9tecXTguNzdXrVu31ooVK4ptxgYA3I2V0AksLtu3b59Gjhyp3bt36/z585Iky7Jks9l07Ngxl2r26NFDXl5emjBhgkJCQnT06FGNHDlSFy9e1IIFC4qzfaP8ceG4AjabTWvXrnW6XkREhPbs2eNwF20A8HTJycn66aeflJmZ6TDuyorgnojA4qK6deuqbdu26tKliypUqODw2p133ulSzTNnzqh///5asmSJ8vPzZbPZ1LVrV02ePFn+/v7F0bZHyc/Pv+ItEK5mypQpWrt2rUaOHKnQ0FCHGRtX7/MEAO705ptv6tVXX1X9+vUdfhlz9Rc7T0RgcZG/v79Onjzp0g/UP5Ofn6+TJ0+qatWqJVLfNKNGjdKYMWMcxnJyctS9e3ctXrzY6XolcZ8nAHCnatWqad26dapbt667W3GbG/+nYQkZPHiwJk2aVOx19+zZo3Hjxum1117T2LFjtWfPnmL/M0yzYcMGjR071v784sWLatu2baGZq2uVn59f5IOwAsBT3XbbbapVq5a723ArAouLunTpon/961+qVKmSIiMjHR6umjFjhpo3b67k5GQFBQUpJSVFMTExmjFjRjF2bp7ly5crISFBb731ljIyMtSyZUvVqlVLn3zyibtbAwAjDB8+XE8//bT27NmjEydOODxuFhwSclFUVJRatmxZ5DksTZo0calmeHi4li5dqrvvvts+tmPHDrVr105Hjx69rn5Nl5GRoYceekgnTpxQjx49NGHCBHe3BADG4FA3gcVlfn5+Sk9PL9ZzTMLDw3XgwAGHAHT+/HnVrl1bKSkpxfbnmOCPCyBJUnp6utavX6+uXbvax6ZOnVqabQEADMVKty76xz/+oWnTpmnAgAHFVnPo0KHq3r27RowYodDQUB05ckT/+7//q2HDhjlM+90IV7oEBgYWOVanTh03dAMA5svOzlZiYqJSU1PVvXt3nTt3
TpJUsWJFN3dWOphhcVFUVJT++9//ys/Pr9Aqg3v37nWp5rXM1txM038AgN9t27ZNnTt3VmBgoPbv369z585p1apVmjlzphYtWuTu9koFgcVFV7vh1M2+GqErvvvuO+3cubPQgkhDhw51U0cAYI7GjRvrlVdeUYcOHeTn56czZ87o4sWLioyMVGpqqrvbKxUcEnJRpUqV1LBhQ3e3cUMYOHCg5s+frwcffLDQgkgAAOnXX39V+/btJf3/740+Pj7Kzs52Z1ulisDiog4dOui2225Tjx499Pjjj9/018dfj08//VS7du1SaGiou1sBACPVr19fX375pTp06GAfW7Fihe666y43dlW6CCwuSk5O1saNGzVv3jw1bdpUERERevzxxxUXF6egoCB3t+dRAgMDdeutt7q7DQAw1uTJk9WmTRvNmDFDFy5cULdu3bR161YtX77c3a2VGs5hKQZ5eXlatWqVXnzxRR04cEAPPvig+vTpo7i4uJtiaf3r9a9//UtffvmlXnjhhUJXQN1zzz1u6goAzJKZmakVK1YoOTlZoaGhateunXx9fd3dVqkhsFynnTt3av78+Zo3b578/PzUs2dPhYaGatq0aapUqZKWLVvm7haNFxERUeS4zWbTwYMHS7kbAICJCCwuGjNmjObNm6dLly6pR48e6tmzp8MaIhcvXpS/v78uXLjgxi4BADeCffv2aeTIkdq9e7fOnz8vSbIsSzabTceOHXNzd6WDc1hcdOTIEX3wwQe6//77i3y9QoUK2rRpUyl35dkuXLig9PR0XZ6hq1ev7saOAMAM3bp1U9u2bTV06FCXbwzr6ZhhuU7p6ek6duyYQkJC5O/v7+52PNLu3bv15JNPateuXZL+/yV73t7ezFABgCR/f3+dPHnypj4v8ubd8+t0/PhxtWzZUsHBwYqNjVVwcLBatmx50yzgU5zi4+PVsWNHnT9/Xr6+vsrMzNTgwYP13nvvubs1ADDC4MGDNWnSJHe34VbMsLioY8eOqlmzpt544w3deuutOn/+vEaOHKmDBw9yoq2TKleurNOnT6tMmTL2FRyzs7MVGRmpI0eOuLs9AHC7ffv2qU2bNjp9+nSh2fyb5eIEAouLAgIClJqaKi8vL/tYVlaWQkJClJ6e7sbOPE/NmjW1fft2ValSRfXq1dPcuXNVpUoV1a9fX7/99pu72wMAt4uKilLLli3VpUuXQuewNGnSxE1dlS5OunVR1apVtX37dof/UXbu3KmqVau6sSvP9PTTT2vDhg3q3LmzBg4cqJiYGJUpU0bPPPOMu1sDACOkpaVp4sSJN/U5LMywuGjhwoXq16+fHn30UVWvXl1JSUlasmSJpk+frscee8zd7Xm0pKQkZWZmqm7duu5uBQCMMGrUKAUGBmrAgAHubsVtCCzXYd++fVq0aJH9KqGuXbs6rMUC15w+fVpVqlRxdxsAYIyoqCj997//lZ+fnwICAhxe27t3r5u6Kl0EFrhNZmamJkyYoH379unee+9Vnz599Mgjj2jbtm0KDAzUF198cdMcmwWAq9mwYcMVX3vggQdKsRP3IbA4oXXr1vY1Qq4mISGhFLrxfE888YROnz6tDh06aNmyZTpy5IjatWunXr16ae7cudqwYYO+/fZbd7cJAG73448/qmHDhu5uw60ILE746KOPrmm73r17l3AnN4aqVavq8OHDuvXWW3X27Fn5+/vr4sWL8vLyUk5OjqpVq6YzZ864u00AcLvq1avrtttuU48ePfT444+rVq1a7m6p1BFY4Da+vr7KyMiwP69SpYpOnz59xdcB4Ga2ceNGzZs3T4sWLVJERIQef/xxxcXFKSgoyN2tlQoCi4vy8/M1ffp0LV68WKmpqQoKClK3bt307LPPqmzZsu5uzyPceuutWr9+vf3eQS1bttTXX38ty7JkWZYeeughZWZmurlLADBLXl6eVq1apRdffFEHDhzQgw8+qD59+iguLu6GvuyZwOKiAQMGaNOmTRoy
ZIjCw8OVkpKid999V/fee6+mTJni7vY8Qs2aNf/0nKBDhw6VUjcAYL6dO3dq/vz5mjdvnvz8/NSzZ0+FhoZq2rRpqlSp0g290jqBxUUBAQH6+eefHZZIPnnypGrXrs1KtwCAYjVmzBjNmzdPly5dUo8ePdSzZ0+HZTQuXrwof3//G/qGsax066LQ0FBlZmY6BJYLFy4oLCzMjV15pl9++UW+vr4KDAy0jx0/flwZGRn661//6sbOAMAMR44c0QcffKD777+/yNcrVKigTZs2lXJXpYsZFheNGjVK8+bNU9++fRUWFqYjR45o9uzZ6tGjh8MKrax6++fuvPNOLVy4ULfffrt97MCBA3rsscf0448/uq8xADBMenq6fbHSP94E8UZHYHFRixYt/nQbm82mtWvXlkI3nu1KVwNxlRAA/O748ePq2bOnNmzYIH9/f6Wnp+uBBx7Qxx9/rODgYHe3Vyo4JOSidevWubuFG0aNGjX0/fff695777WPff/99xxeA4D/55lnnlGdOnX0xRdf6NZbb9X58+c1cuRIPfvsszf0ibaXY4blOqSnp2v//v06f/68w3hsbKybOvJMCxYs0D/+8Q/1799fkZGROnTokKZNm6aJEyeqe/fu7m4PANwuICBAqamp8vLyso9lZWUpJCTkprnQgxkWF02cOFEjRoxQ9erVVaFCBfu4zWYjsDgpLi5OYWFh+uijj7R582aFh4fr888/V9OmTd3dGgAYoWrVqtq+fbvD/dV27typqlWrurGr0sUMi4sCAgK0du1aNWjQwN2tAABucAsXLlS/fv306KOPqnr16kpKStKSJUs0ffr0m+bijht3SbwSFhgYqJo1a7q7jRvCxYsX9eKLL6pmzZry9fWVJK1atUoTJ050b2MAYIhHH31UiYmJql69uo4dO6YaNWro22+/vWnCisQMi8u2bNmit99+W4899lihKbkrXSePov39739XTk6Ohg0bppiYGJ05c0apqalq0aKF9u/f7+72AAAG4BwWF23atElLly7VDz/8UOgclr1797qxM8+zYsUKpaSkyMfHx75Uf3BwsFJTU93cGQC4T+vWrf/09iWSlJCQUArduB+BxUWjR4/Wt99+q0aNGrm7FY9XuXJlnTx50uEy5kOHDikkJMSNXQGAe3GVpCMCi4sCAwNVu3Ztd7dxQxg4cKDat2+vkSNHKi8vT8uXL9fYsWM1cOBAd7cGAG7Tu3dvd7dgFM5hcdG7776rlStXauDAgapWrZrDa/fcc4+buvJcCxcu1OzZs5WcnKzQ0FD17dtXcXFx7m4LAIyQn5+v6dOna/HixUpNTVVQUJC6deumZ599VmXLlnV3e6WCwOKiiIgI+9c2m00Ff402m00HDx50V1s3jNzcXH388cfq06ePu1sBALcbMGCANm3apCFDhig8PFwpKSl69913de+992rKlCnubq9UEFiuw/Hjx7Vy5UodP35cL730klJSUpSfn68aNWq4uzWP8Z///Ec//vijatWqpU6dOiknJ0dTp07VW2+9pSpVqmjXrl3ubhEA3C4gIEA///yzww0PT548qdq1a980K92yDouLVq9erfr162vFihUaN26cJCkpKUnPPfecmzvzHBMmTFDHjh312WefqVevXvrHP/6hJk2a6Msvv9SsWbMIKwDw/4SGhiozM9Nh7MKFCzfVPdeYYXFR3bp1NXv2bDVp0kR+fn46c+aMsrOzFRYWphMnTri7PY8QERGhxYsX6+6779aWLVvUpEkTTZkyhdAHAH8watQozZs3T3379lVYWJiOHDmi2bNnq0ePHqpbt659uxt5ITkCi4sCAgJ08uRJ2Ww2ValSRadPn1ZOTo5CQ0MJLNfI19dXGRkZ9ucFdyAFADhq0aLFn25js9m0du3aUujGPbis2UX33nuvPvjgA8XHx9vH5syZo+bNm7uxK89iWZZOnjxpP2HZ29vb4bmkQldgAcDNaN26de5uwe2YYXFRSkqK2rdv
r9zcXB04cEANGjRQTk6Oli9frvDwcHe35xHKlCnjcIXVH9lsNuXl5ZVyVwBgpvT0dO3fv7/QTHRsbKybOipdBJbrYFmWNm/ebF87pEmTJjfN9fAAgNIzceJEjRgxQtWrVy90O5jt27e7sbPSQ2CBMVJSUpSamqrg4GBmqQDgMgEBAVq7dq0aNGjg7lbchsua4Xa//PKL7rnnHkVFRal79+6KiopS48aNdeDAAXe3BgBGCAwMVM2aNd3dhlsxwwK3u/fee/XII49oxIgR8vb2Vk5Ojt544w0lJCTohx9+cHd7AOB2W7Zs0dtvv63HHntMVatWdXjt/vvvd1NXpYvAArfz9fXV6dOnVa7c/79oLTc3V1WqVHG47BkAblb/+te/9NJLLykoKKjQOSx79+51Y2elh8ACt3vqqaf0t7/9TU8++aR97JNPPtG6des0e/ZsN3YGAGaoXLmyvv76azVq1MjdrbgNgQVu0bp1a9lsNkm/z6asW7dOkZGRCg8P15EjR/Trr7/qb3/7m1atWuXmTgHA/e644w5t27ZNt912m7tbcRsWjoNbdO/e3eH5E0884aZOAMB8zz77rDp37qyBAwcWWlDznnvucVNXpYsZFgAADBcREWH/+vIFN202mw4ePOiutkoVgQVu179//yu+NnXq1FLsBADMdfz4ca1cuVLHjx/XSy+9pJSUFOXn56tGjRrubq1UsA4L3C4wMNDhYVmWFi1aJC8vL3e3BgBGWL16terXr68VK1Zo3LhxkqSkpKSb6u72zLDASD/++KOGDRumr776yt2tAIDb1a1bV7Nnz1aTJk3k5+enM2fOKDs7W2FhYTpx4oS72ysVBBYYKTs7W1WrVtXZs2fd3QoAuF1AQIBOnjwpm82mKlWq6PTp08rJyVFoaOhNE1i4Sghu99lnnzk8v3DhghYuXKjGjRu7qSMAMMu9996rDz74QPHx8faxOXPmqHnz5m7sqnQxwwK3a9GihcPzW2+9VQ0bNtSgQYPk7+/vpq4AwBwpKSlq3769cnNzdeDAATVo0EA5OTlavnz5TXOzWAILAAAewLIsbd68WcnJyQoNDVWTJk1UtmxZd7dVaggsMMKePXu0ZMkSpaamKigoSF27dlXdunXd3RYAwBBc1gy3mzFjhpo3b67k5GQFBQUpJSVFMTExmjFjhrtbAwAYghkWuF14eLiWLl2qu+++2z62Y8cOtWvXTkePHnVjZwAAUzDDAiNERUU5PL/99ttVpgz/ewIAfsdPBLjd0KFD1b17d/3www86cuSIvv/+e/Xs2VPDhg3TiRMn7A8AwM2LQ0Jwu2uZSbHZbMrLyyuFbgAAJiKwAAAA43FICAAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAw3v8FQhb3mJzithAAAAAASUVORK5CYII=\n"
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Missing values matter most for preprocessing, so plot the share of missing entries per feature (only features that have any).\n",
    "missing = data.isnull().sum().div(len(data))\n",
    "missing = missing.loc[missing > 0].sort_values()\n",
    "x = np.arange(missing.size)\n",
    "fig, ax = plt.subplots()\n",
    "ax.bar(x, missing)\n",
    "ax.set_xticks(x)\n",
    "ax.set_xticklabels(missing.index.tolist(), rotation=90, fontsize=\"small\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:34.278578600Z",
     "start_time": "2024-09-27T12:57:33.648722700Z"
    }
   },
   "id": "94ef2e3e86bea43e"
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "outputs": [],
   "source": [
    "#  可以发现那些匿名特征的缺失值都是很多的，还有employmentLength特征的缺失值也很多。后续会进行处理。\n",
    "\n",
    "# 另外，还有很多特征并不是能够直接用来训练的特征，因此需要对其进行处理，比如grade(贷款等级)、subGrade(贷款等级之子级)、employmentLength(就业年限（年）)、issueDate(贷款发放的月份)、earliesCreditLine(借款人最早报告的信用额度开立的月份)\n",
    "# 需要进行预处理."
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:34.295559100Z",
     "start_time": "2024-09-27T12:57:34.278578600Z"
    }
   },
   "id": "5d82b9060b48807d"
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['A', 'B', 'C', 'D', 'E', 'F', 'G']\n",
      "['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']\n"
     ]
    }
   ],
   "source": [
    "# Show the distinct loan grades and sub-grades in sorted order.\n",
    "for column in (\"grade\", \"subGrade\"):\n",
    "    print(sorted(data[column].unique()))"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:34.358242500Z",
     "start_time": "2024-09-27T12:57:34.295559100Z"
    }
   },
   "id": "ee205992c6795dd"
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "outputs": [
    {
     "data": {
      "text/plain": "1 year        65671\n10+ years    328525\n2 years       90565\n3 years       80163\n4 years       59818\n5 years       62645\n6 years       46582\n7 years       44230\n8 years       45168\n9 years       37866\n< 1 year      80226\nNaN           58541\nName: employmentLength, dtype: int64"
     },
     "execution_count": 253,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Next, process the employmentLength (years of employment) feature: inspect its value counts, including missing values.\n",
    "data['employmentLength'].value_counts(dropna=False).sort_index()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:34.392300600Z",
     "start_time": "2024-09-27T12:57:34.358242500Z"
    }
   },
   "id": "f1b8ef68ce301e22"
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "outputs": [],
   "source": [
    "# Convert employmentLength labels (e.g. \"< 1 year\", \"3 years\", \"10+ years\") to a number of years.\n",
    "def employmentLength_to_int(s):\n",
    "    \"\"\"Return the whole number of years encoded in an employment-length label.\n",
    "\n",
    "    \"10+ years\" maps to 10 and \"< 1 year\" maps to 0; every other label uses its\n",
    "    leading number. Missing values are returned unchanged, and values that are\n",
    "    not strings (already converted) pass through so the cell can be re-run.\n",
    "    \"\"\"\n",
    "    if pd.isnull(s) or not isinstance(s, str):\n",
    "        return s\n",
    "    if s == \"10+ years\":\n",
    "        return np.int8(10)\n",
    "    if s == \"< 1 year\":\n",
    "        return np.int8(0)\n",
    "    return np.int8(s.split()[0])  # the leading token is the year count\n",
    "\n",
    "\n",
    "# Assign the converted column back instead of calling replace(..., inplace=True) on a\n",
    "# column selection: that is chained assignment and is not guaranteed to modify `data`.\n",
    "data[\"employmentLength\"] = data[\"employmentLength\"].apply(employmentLength_to_int)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:37.638742100Z",
     "start_time": "2024-09-27T12:57:34.392300600Z"
    }
   },
   "id": "85c54c9faec46a37"
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "outputs": [
    {
     "data": {
      "text/plain": "677       Apr-2011\n759960    Jun-1997\n458335    Oct-1999\n442486    Feb-2012\n800752    Nov-1988\nName: earliesCreditLine, dtype: object"
     },
     "execution_count": 255,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Process the earliesCreditLine date column: first look at a random sample of raw values.\n",
    "data['earliesCreditLine'].sample(5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:37.678362400Z",
     "start_time": "2024-09-27T12:57:37.642264300Z"
    }
   },
   "id": "6fdd0e9aefc6406e"
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "outputs": [],
   "source": [
    "# Keep only the year (the last four characters of values such as \"Apr-2011\").\n",
    "# Going through str keeps the cell re-runnable once the column already holds integer years.\n",
    "data[\"earliesCreditLine\"] = data[\"earliesCreditLine\"].astype(str).str[-4:].astype(\"int64\")"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:37.938337500Z",
     "start_time": "2024-09-27T12:57:37.672733Z"
    }
   },
   "id": "bfcca10ad9f72a85"
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "outputs": [
    {
     "data": {
      "text/plain": "1944        2\n1945        1\n1946        2\n1949        1\n1950        7\n        ...  \n2011    12282\n2012     8304\n2013     4375\n2014     1863\n2015      251\nName: earliesCreditLine, Length: 70, dtype: int64"
     },
     "execution_count": 257,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Row counts per extracted year, ordered by year.\n",
    "data['earliesCreditLine'].value_counts(dropna=False).sort_index()"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:37.954173300Z",
     "start_time": "2024-09-27T12:57:37.941701900Z"
    }
   },
   "id": "bfeaffb2ee6af7a4"
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "grade  类型数目为： 7\n",
      "subGrade  类型数目为： 35\n",
      "employmentTitle  类型数目为： 298101\n",
      "homeOwnership  类型数目为： 6\n",
      "verificationStatus  类型数目为： 3\n",
      "purpose  类型数目为： 14\n",
      "postCode  类型数目为： 935\n",
      "regionCode  类型数目为： 51\n",
      "applicationType  类型数目为： 2\n",
      "initialListStatus  类型数目为： 2\n",
      "title  类型数目为： 47903\n",
      "policyCode  类型数目为： 1\n"
     ]
    }
   ],
   "source": [
    "# Categorical features considered for one-hot encoding; print how many distinct values each one has.\n",
    "cate_features = [\n",
    "    \"grade\", \"subGrade\", \"employmentTitle\", \"homeOwnership\",\n",
    "    \"verificationStatus\", \"purpose\", \"postCode\", \"regionCode\",\n",
    "    \"applicationType\", \"initialListStatus\", \"title\", \"policyCode\",\n",
    "]\n",
    "for fea, n_unique in data[cate_features].nunique().items():\n",
    "    print(fea, \" 类型数目为：\", n_unique)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:38.088413700Z",
     "start_time": "2024-09-27T12:57:37.974479400Z"
    }
   },
   "id": "61b04086bf5b3555"
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "outputs": [],
   "source": [
    "# 可以看到其中一些特征的类别数目比较少，就适合转换成one-hot向量，但是那些类别数目特别多的就不适合，那么参考baseline采取的做法就是增加计数和排序两类特征"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:38.102358300Z",
     "start_time": "2024-09-27T12:57:38.088413700Z"
    }
   },
   "id": "8baa38f3e22740d1"
  },
  {
   "cell_type": "code",
   "execution_count": 260,
   "outputs": [],
   "source": [
    "# One-hot encode the low-cardinality categorical columns.\n",
    "# drop_first=True represents k categories with k-1 indicator columns; the omitted category is the all-zeros case.\n",
    "one_hot_columns = [\n",
    "    \"grade\", \"subGrade\", \"homeOwnership\",\n",
    "    \"verificationStatus\", \"purpose\", \"regionCode\",\n",
    "]\n",
    "data = pd.get_dummies(data, columns=one_hot_columns, drop_first=True)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:38.778226300Z",
     "start_time": "2024-09-27T12:57:38.102358300Z"
    }
   },
   "id": "3dab9643c5113895"
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "outputs": [],
   "source": [
    "# High-cardinality categorical features are replaced by two numeric features each:\n",
    "#   <name>_cnts - number of rows sharing the same category value\n",
    "#   <name>_rank - descending rank of the row's id within its category group\n",
    "for f in ['employmentTitle', 'postCode', 'title']:\n",
    "    # Fill missing categories with the most frequent value. Assign the result back:\n",
    "    # fillna(..., inplace=True) on data[f] is chained assignment and may leave `data` unchanged.\n",
    "    data[f] = data[f].fillna(data[f].mode()[0])\n",
    "    ids_by_category = data.groupby([f])['id']\n",
    "    counts = ids_by_category.transform('count')\n",
    "    ranks = ids_by_category.rank(ascending=False)\n",
    "    # Defensive fill before the integer cast (a NaN would make astype(int) fail).\n",
    "    data[f + '_cnts'] = counts.fillna(0).astype(int)\n",
    "    data[f + '_rank'] = ranks.fillna(0).astype(int)\n",
    "    # Drop the raw categorical column now that it has been encoded.\n",
    "    del data[f]"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:39.414200200Z",
     "start_time": "2024-09-27T12:57:38.812100200Z"
    }
   },
   "id": "4e50ab21088638d5"
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "outputs": [
    {
     "data": {
      "text/plain": "(1000000, 154)"
     },
     "execution_count": 262,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape (rows, columns) of the combined frame after the encoding steps.\n",
    "data.shape"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:39.432809400Z",
     "start_time": "2024-09-27T12:57:39.418213100Z"
    }
   },
   "id": "73a0696587568fc3"
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "outputs": [
    {
     "data": {
      "text/plain": "   id  loanAmnt  term  interestRate  installment  employmentLength  \\\n0   0   35000.0     5         19.52       917.97               2.0   \n1   1   18000.0     5         18.49       461.90               5.0   \n2   2   12000.0     5         16.99       298.17               8.0   \n3   3   11000.0     3          7.26       340.96              10.0   \n4   4    3000.0     3         12.99       101.07               NaN   \n\n   annualIncome   issueDate    dti  delinquency_2years  ...  regionCode_47  \\\n0      110000.0  2014-07-01  17.05                 0.0  ...              0   \n1       46000.0  2012-08-01  27.83                 0.0  ...              0   \n2       74000.0  2015-10-01  22.77                 0.0  ...              0   \n3      118000.0  2015-08-01  17.21                 0.0  ...              0   \n4       29000.0  2016-03-01  32.16                 0.0  ...              0   \n\n   regionCode_48  regionCode_49  regionCode_50  employmentTitle_cnts  \\\n0              0              0              0                  1392   \n1              0              0              0                   151   \n2              0              0              0                     2   \n3              0              0              0                     2   \n4              0              0              0                 63979   \n\n   employmentTitle_rank  postCode_cnts  postCode_rank  title_cnts  title_rank  \n0                  1392           2646           2646        8687        8687  \n1                   151           4751           4751          37          37  \n2                     2           2167           2167      491401      491401  \n3                     2            689            689      185386      185386  \n4                 63979           2161           2161        5896        5896  \n\n[5 rows x 154 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>loanAmnt</th>\n      <th>term</th>\n      <th>interestRate</th>\n      <th>installment</th>\n      <th>employmentLength</th>\n      <th>annualIncome</th>\n      <th>issueDate</th>\n      <th>dti</th>\n      <th>delinquency_2years</th>\n      <th>...</th>\n      <th>regionCode_47</th>\n      <th>regionCode_48</th>\n      <th>regionCode_49</th>\n      <th>regionCode_50</th>\n      <th>employmentTitle_cnts</th>\n      <th>employmentTitle_rank</th>\n      <th>postCode_cnts</th>\n      <th>postCode_rank</th>\n      <th>title_cnts</th>\n      <th>title_rank</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>35000.0</td>\n      <td>5</td>\n      <td>19.52</td>\n      <td>917.97</td>\n      <td>2.0</td>\n      <td>110000.0</td>\n      <td>2014-07-01</td>\n      <td>17.05</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>1392</td>\n      <td>1392</td>\n      <td>2646</td>\n      <td>2646</td>\n      <td>8687</td>\n      <td>8687</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>18000.0</td>\n      <td>5</td>\n      <td>18.49</td>\n      <td>461.90</td>\n      <td>5.0</td>\n      <td>46000.0</td>\n      <td>2012-08-01</td>\n      <td>27.83</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>151</td>\n      <td>151</td>\n      <td>4751</td>\n      <td>4751</td>\n      <td>37</td>\n      <td>37</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n    
  <td>12000.0</td>\n      <td>5</td>\n      <td>16.99</td>\n      <td>298.17</td>\n      <td>8.0</td>\n      <td>74000.0</td>\n      <td>2015-10-01</td>\n      <td>22.77</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2</td>\n      <td>2167</td>\n      <td>2167</td>\n      <td>491401</td>\n      <td>491401</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>11000.0</td>\n      <td>3</td>\n      <td>7.26</td>\n      <td>340.96</td>\n      <td>10.0</td>\n      <td>118000.0</td>\n      <td>2015-08-01</td>\n      <td>17.21</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>2</td>\n      <td>2</td>\n      <td>689</td>\n      <td>689</td>\n      <td>185386</td>\n      <td>185386</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>3000.0</td>\n      <td>3</td>\n      <td>12.99</td>\n      <td>101.07</td>\n      <td>NaN</td>\n      <td>29000.0</td>\n      <td>2016-03-01</td>\n      <td>32.16</td>\n      <td>0.0</td>\n      <td>...</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>0</td>\n      <td>63979</td>\n      <td>63979</td>\n      <td>2161</td>\n      <td>2161</td>\n      <td>5896</td>\n      <td>5896</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 154 columns</p>\n</div>"
     },
     "execution_count": 263,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the processed frame.\n",
    "data.head(5)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:39.478184600Z",
     "start_time": "2024-09-27T12:57:39.438315400Z"
    }
   },
   "id": "feb1251bef9f7a63"
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "outputs": [],
   "source": [
    "# Split the combined frame back into training and test rows using the \"origin\" marker column.\n",
    "is_train = data[\"origin\"] == \"train\"\n",
    "is_test = data[\"origin\"] == \"test\"\n",
    "train = data.loc[is_train].reset_index(drop=True)\n",
    "test = data.loc[is_test].reset_index(drop=True)\n",
    "\n",
    "# Columns that are not used as model inputs.\n",
    "excluded_columns = ('id', 'issueDate', 'isDefault', \"origin\")\n",
    "features = [col for col in data.columns if col not in excluded_columns]\n",
    "\n",
    "x_train, x_test = train[features], test[features]\n",
    "# NOTE(review): `target` comes from an earlier cell; presumably the isDefault labels in training-row order - confirm.\n",
    "y_train = target"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:40.326289500Z",
     "start_time": "2024-09-27T12:57:39.448155600Z"
    }
   },
   "id": "21b0849067792f60"
  },
  {
   "cell_type": "markdown",
   "source": [
    "### 模型训练"
   ],
   "metadata": {
    "collapsed": false
   },
   "id": "919a4a3f39514697"
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "outputs": [],
   "source": [
    "# # 设置 XGBoost 参数\n",
    "# xgb_params = {\n",
    "#     'booster': 'gbtree',  # 使用基于树的 boosting 方法\n",
    "#     'objective': 'binary:logistic',  # 学习任务和相应的学习目标（二分类逻辑回归）\n",
    "#     'eval_metric': 'auc',  # 模型评估指标（面积在曲线下）\n",
    "#     'gamma': 1,  # 分裂节点所需的最小损失函数下降值\n",
    "#     'min_child_weight': 1.5,  # 子节点的最小权重和\n",
    "#     'max_depth': 5,  # 树的最大深度\n",
    "#     'lambda': 10,  # L2 正则化项权重（ridge regression）\n",
    "#     'subsample': 0.7,  # 训练每棵树时使用的样本比例\n",
    "#     'colsample_bytree': 0.7,  # 构建树时的特征采样比例\n",
    "#     'colsample_bylevel': 0.7,  # 每一层的特征采样比例\n",
    "#     'eta': 0.04,  # 学习率\n",
    "#     'tree_method': 'exact',  # 构建树的方法（精确贪心算法）\n",
    "#     'seed': 1,  # 随机数种子\n",
    "#     'nthread': 36,  # 并行运行的线程数\n",
    "# }\n",
    "# \n",
    "# # 训练 XGBoost 模型\n",
    "# dtrain = xgb.DMatrix(x_train, y_train)\n",
    "# num_round = 3000\n",
    "# xgb_model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=num_round)\n",
    "# \n",
    "# # 评估模型\n",
    "# valid_xgb = xgb_model.predict(xgb.DMatrix(x_train))\n",
    "# print(\"XGBoost score: {:<8.8f}\".format(roc_auc_score(y_train, valid_xgb)))\n",
    "# \n",
    "# # 在测试集上进行预测\n",
    "# predict_xgb = xgb_model.predict(xgb.DMatrix(x_test))\n",
    "# \n",
    "# # 保存预测结果\n",
    "# testA = pd.read_csv(\"D:\\PycharmProjects\\pythonProject\\data\\\\testA.csv\")\n",
    "# testA['isDefault'] = predict_xgb\n",
    "# submission_data = testA[['id', 'isDefault']]\n",
    "# submission_data.to_csv(\"myresult_xgb.csv\", index=False)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:40.342295300Z",
     "start_time": "2024-09-27T12:57:40.328361400Z"
    }
   },
   "id": "aab3df1b7cdf5a91"
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "E:\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:460: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Logistic Regression score: 0.60464398\n"
     ]
    }
   ],
   "source": [
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import pandas as pd\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Logistic Regression hyper-parameters.\n",
    "logreg_params = {\n",
    "    'penalty': 'l2',  # L2 regularisation\n",
    "    'C': 1.0,  # inverse of the regularisation strength\n",
    "    'solver': 'lbfgs',  # optimisation algorithm\n",
    "    'max_iter': 1000,  # the earlier limit of 100 stopped lbfgs before it converged\n",
    "    'random_state': 1  # random seed\n",
    "}\n",
    "# Fill missing values with the column mean.\n",
    "imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
    "# Standardise the features: lbfgs did not converge on the unscaled inputs\n",
    "# (scikit-learn's ConvergenceWarning recommends scaling the data).\n",
    "scaler = StandardScaler()\n",
    "logreg_model = LogisticRegression(**logreg_params)\n",
    "pipeline = make_pipeline(imputer, scaler, logreg_model)\n",
    "# Train the pipeline.\n",
    "pipeline.fit(x_train, y_train)\n",
    "\n",
    "# NOTE: this AUC is computed on the training rows themselves, so it is an in-sample score,\n",
    "# not a validation score.\n",
    "valid_logreg = pipeline.predict_proba(x_train)[:, 1]\n",
    "print(\"Logistic Regression score: {:<8.8f}\".format(roc_auc_score(y_train, valid_logreg)))\n",
    "\n",
    "# Predict default probabilities for the test rows.\n",
    "predict_logreg = pipeline.predict_proba(x_test)[:, 1]\n",
    "# Save the predictions. The path is a raw string so its backslashes are taken literally\n",
    "# instead of relying on unrecognised escape sequences.\n",
    "# NOTE(review): assumes the rows of testA.csv are in the same order as `test` - confirm.\n",
    "testA = pd.read_csv(r\"D:\\PycharmProjects\\pythonProject\\data\\testA.csv\")\n",
    "testA['isDefault'] = predict_logreg\n",
    "submission_data = testA[['id', 'isDefault']]\n",
    "submission_data.to_csv(\"myresult_logreg.csv\", index=False)\n"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2024-09-27T12:57:52.698502200Z",
     "start_time": "2024-09-27T12:57:40.342295300Z"
    }
   },
   "id": "bb4ec32567ffdbb2"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
